
# Replication Files

# "Foreign Policy Failures and the Attractiveness of Great Powers"
# by Rachel Myrick and William Marble, BJPolS 

# Text analysis of BBC articles 

out.dir <- "./" 

library(data.table)
library(tidyverse)
library(tidytext)
library(SnowballC)
library(stm)


theme_set(theme_classic() + 
            theme(axis.text = element_text(size = 14), 
                  title = element_text(size = 16),
                  strip.text = element_text(size = 16),
                  legend.text = element_text(size = 14)))


set.seed(94107)
tr_date <- ymd("2021-08-15")


# Load data ---------------------------------------------------------------

fs <- setdiff(list.files("data/bbc-headlines/", pattern = "*.csv"), "completed-dates.csv")
dats <- lapply(fs, \(x) fread(paste0("data/bbc-headlines/", x)))
dat <- bind_rows(dats)
dat <- dat %>% 
  mutate(date_relative = as.numeric(as.Date(date) - tr_date))

# save full dataset
saveRDS(dat, "data/bbc-headlines/bbc-headlines.rds")


# data frame of distinct dates
dates <- dat %>% distinct(date_relative)


# Subset data -------------------------------------------------------------

## filter to news section ------------------------------------------
dat <- dat %>% 
  mutate(
    combined_headline = paste0(headline, " - ", description),
    date = as.Date(date),
    section = sub("^https?://www\\.bbc\\.com/([^/]+)/.*", "\\1", url)
  ) %>% 
  filter(section == "news") %>% 
  mutate(clean_headline = str_replace_all(combined_headline, "[[:punct:]]", " "))
  

## limit to articles about the US --------------------------------------


# keyword search in the `combined_headline` column for the following list of terms
dict <- c("united states", "usa", "us", "biden", "trump", "washington")

# generate regex, adding \b to single-word terms
dict_bounded <- ifelse(grepl("^[A-Za-z]+$", dict),
                       paste0("\\b", dict, "\\b"),
                       dict)
pattern <- str_c(dict_bounded, collapse = "|") %>%
  regex(ignore_case = TRUE)
dat <- dat %>% 
  mutate(about_us = str_detect(combined_headline, pattern))


# filter and save N
dat_us <- dat %>% 
  filter(about_us) %>% 
  mutate(docid = row_number())
cat(prettyNum(nrow(dat_us), big.mark = ","), 
file = file.path(out.dir, "Tables/bbc-n-usa.tex"))







# Text analysis --------------------------------------------------------
detect_number <- function(x) {
  grepl("\\b\\d{1,3}(?:,\\d{3})*(?:\\.\\d+)?\\b|\\b\\d+(?:\\.\\d+)?\\b", x)
}


## prep for topic modeling --------------------------------------------

docs <- dat_us %>%
  mutate(combined_headline = tm::removeNumbers(combined_headline)) %>% 
  mutate(combined_headline = tm::removePunctuation(combined_headline)) %>% 
  unnest_tokens(word, combined_headline) %>%
  anti_join(stop_words, by = "word") 
  
# stem words using snowballc - porter algorithm
docs <- docs %>%
  mutate(word = wordStem(word)) 

# create exploratory wordcloud...mostly covid
wordcloud::wordcloud(docs$word, random.order = FALSE, max.words = 50)

# create sparse document-term matrix for use with stm
dtm <- docs %>% 
  count(url, word) %>%
  cast_sparse(url, word, n)


## keyword search for "Afghanistan" -----------------------------------------

afg_keywords <- c("afghanistan", "afghan", "taliban", "kabul")

afg_pattern <- str_c(afg_keywords, collapse = "|") %>%
  regex(ignore_case = TRUE)
dat_us <- dat_us %>% 
  mutate(afg_keyword = str_detect(combined_headline, afg_pattern))

dat_sum <- dat_us %>% 
  group_by(afg_keyword, date) %>% 
  count(.drop = FALSE)

# fill in dates with 0's that were omitted in the count() step
dat_sum <- expand.grid(date = seq(min(dat_us$date), max(dat_us$date), by = "day"),
            afg_keyword = c(TRUE, FALSE)) %>%
  left_join(dat_sum, by = c("date", "afg_keyword")) %>%
  mutate(n = ifelse(is.na(n), 0, n))


plotdat <- dat_sum %>% filter(afg_keyword)
ggplot(dat_us) + 
  geom_vline(xintercept = as.numeric(tr_date), linetype = "dashed", color = "black") + 
  geom_smooth(aes(x = date, y = as.numeric(afg_keyword), group = date >= tr_date),
              method = "loess", se = FALSE) +
  geom_point(data = plotdat, aes(x = date, y = n / max(n)), alpha = 0.3) +
  labs(x = NULL, y = "Proportion of Articles") +
  scale_y_continuous(breaks = seq(0, 1, .25), labels = scales::percent,
                     sec.axis = sec_axis(transform = \(x) x * max(plotdat$n), 
                                         breaks = 0:10, name = "Daily Number of Articles") ) 
ggsave(file.path(out.dir, "Figures/bbc-afg-coverage-keywords.pdf"), width = 6, height = 4)



## search for number of topics ----------------------------------------------

tmp <- textProcessor(documents = dat_us$combined_headline,
                     metadata = data.frame(dat_us),
                     lowercase = TRUE,
                     removestopwords = TRUE,
                     removenumbers = TRUE,
                     removepunctuation = TRUE,
                     stem = TRUE,
                     wordLengths = c(3, Inf))
out <- prepDocuments(tmp$documents, tmp$vocab, tmp$meta)
searchk <- searchK(out$documents, out$vocab, K = c(5:20), 
                   proportion = .2, cores = 4, heldout.seed = 94107)

pdf(file.path(out.dir, "Figures/bbc-stm-diagnostics.pdf"), 
    width = 6, height = 6)
plot(searchk) 
dev.off()
# 9 topics seems reasonable -- local max in holdout likelihood and semantic coherence



## fit STM ---------------------------------------------------------------

# estimate with K = 9
stm_fit <- stm(dtm, K = 9, prevalence = ~ s(date_relative), data = dat_us)
summary(stm_fit)

pdf(file.path(out.dir, "Figures/bbc-stm-topics.pdf"), width = 8, height = 6)
plot(stm_fit, type = "summary", n = 6, labeltype = "frex")
dev.off()

s


# Estimate topic proportions over time
topic_prop <- as.data.frame(stm_fit$theta)
names(topic_prop) <- paste0("topic", 1:ncol(topic_prop))
topic_prop <- topic_prop %>% 
  rownames_to_column("docid") %>% 
  pivot_longer(-docid, names_to = "topic", values_to = "prob") %>% 
  mutate(docid = as.integer(docid)) %>% 
  mutate(topic = gsub("topic", "Topic ", topic)) %>% 
  mutate(topic = ifelse(topic == "Topic 6", "Topic 6 (Afghanistan)", topic))

topic_prop <- left_join(topic_prop, dat_us, by = "docid")


# topic 6 is about afg
ggplot(topic_prop %>% filter(topic == "Topic 6 (Afghanistan)")) + 
  aes(x = date, y = prob, group = (date_relative > 0)) +
  geom_vline(xintercept = as.numeric(tr_date), linetype = "dashed", color = "black") + 
  geom_smooth() + 
  labs(x = NULL, y = "Topic Prevalence", title = "STM: Afghanistan Prevalence")
ggsave(file.path(out.dir, "Figures/bbc-stm-afg-coverage.pdf"), width = 6, height = 4)


# plot all topics
ggplot(topic_prop ) + 
  aes(x = date, y = prob, group = (date_relative > 0)) +
  geom_vline(xintercept = as.numeric(tr_date), linetype = "dashed", color = "black") + 
  geom_smooth() + 
  labs(x = NULL, y = "Topic Prevalence", title = "STM: All Topic Prevalence") + 
  facet_wrap(~ str_wrap(topic, 8), ncol = 3)
ggsave(file.path(out.dir, "Figures/bbc-stm-all-topics-over-time.pdf"), width = 6, height = 6)




